


"""
Filename: B2_DownloadEDGAR.py
Goal: Download and process SEC EDGAR log files to determine the temporal validity of institutional investor IP addresses and track proxy filing access patterns.
Contact: mjha@gsu.edu (author) and gormley@wustl.edu

Best Practices:
The SEC enforces rate limits on EDGAR system access. Excessive request rates trigger automatic IP blocking. Implement few seconds delays between requests.
This code requires repeated execution across multiple sessions/days. Use high computing resources for processing large log files (cluster computing recommended).

Data Sources
1. IP Address Registry (Proprietary)
File: parent_ip.csv
Source: Digital Elements (proprietary)
Description: Organizations registered to each IP address as of December 31, 2016
Purpose: Identify which institutional investors control specific IP addresses
Coverage: Point-in-time snapshot; IP assignments may change over time

2. Holdings Data (Generated)
File: HoldingFromCRSP_CIK.dta
Source: Created by code in A1_CreateHolding
Description: Institutional holdings data from CRSP linked to SEC CIK identifiers
Purpose: Determine which institutions should have accessed which proxy filings

3. Meeting Details (Generated)
File: MeetingDetails.dta
Source: Created by code in B1_MeetingDetails
Description: Shareholder meeting characteristics and proposal details
Purpose: Identify meetings of interest to institutional investors

4. Filing Dates (Public)
File: filingdate.dta
Source: SEC EDGAR Companyidx files
Description: Filing dates for each DEF14A (proxy statement) submission
Purpose: Establish timeline for when proxy materials became publicly available
Access: Freely available from SEC EDGAR system
"""

#### Part 1 Download from SEC
import os
import sys
if os.getcwd() == "YOUR_FOLDER":
    num = int(sys.argv[1])
else: 
    num = 4012 ### number of days before June 30th, 2017
    
#make date
from datetime import datetime, timedelta
secbegin = datetime(2017, 6, 30)
fdate = secbegin - timedelta(days=num) #file date
fyear = fdate.strftime("%Y")
fmonth = fdate.strftime("%m")
fday = fdate.strftime("%d")
fname = fyear + fmonth + fday
fquarter = str((fdate.month - 1)//3 + 1)

# exit if file exists
if os.path.exists('SEC/count/c' + fname + '.csv'):
    print('result file exists')
    sys.exit()

# dowload zip file
import pandas as pd
import re
regex = re.compile("\d+\.\d+\.\d+|$")

if os.path.exists('SEC/zip/log' + fname + '.zip'):
    print('zip file exists')
else: 
    zipurl = 'https://www.sec.gov/dera/data/Public-EDGAR-log-file-data/' + \
        fyear + '/Qtr' + fquarter + '/log' + fname + '.zip'
        
    # save zip file
    import wget
    wget.download(zipurl, 'SEC/zip/log' + fname + '.zip')
    print('zip file downloaded')
    
# extract, but delete later
import zipfile
with zipfile.ZipFile('SEC/zip/log' + fname + '.zip', 'r') as zip_ref:
    zip_ref.extractall('SEC/tmp/')
        
#get data
logf = pd.read_csv('SEC/tmp/log' + fname + '.csv', 
                   usecols=['ip', 'date', 'time', 'accession', 'cik', 'extention', 'size', 'idx']) 

os.remove('SEC/tmp/log' + fname + '.csv')
print("File Removed!")

# merge - this keeps only institutions ip address
logf['3ip'] = logf.apply(lambda row: regex.search(row.ip).group(), axis=1)
levip = pd.read_csv('parent_ip_pseudo.csv') ### this data needs to be purchased
### for example 199.253.64.XXX for BlackRock
logf = logf.merge(levip, left_on='3ip', right_on='3ip') 

# remove bots
logf['ipcount'] = logf.groupby('ip')['ip'].transform('count')
logf = logf[logf['ipcount'] < 1001 ]

# remove index, ico, xml, 500 byte and above size
logf = logf[logf['idx'] < 1] 
logf = logf[~logf.extention.str.contains('.xml')]
logf = logf[~logf.extention.str.contains('.ico')]
logf = logf[logf['size'] > 500] 

# combine within 5 minutes
def timem(text):
    tlist = text.split(":")
    return int(tlist[0])*60 + int(tlist[1])

logf = logf.sort_values(by=['ip', 'accession', 'time'])
logf.head()
logf['timem'] = logf.apply(lambda row: timem(row.time), axis=1)
logf['timediff'] = logf['timem'] - logf['timem'].shift(1)
logf['ipdiff'] = logf['ip'] == logf['ip'].shift(1)
logf['accessiondiff'] = logf['accession'] == logf['accession'].shift(1)
logf = logf.loc[~((logf['ipdiff'] == True) & (logf['accessiondiff'] == True) \
                  & (logf['timediff'] < 6)),:] 

logf['bcount'] = logf.groupby(['institution', 'cik'])['cik'].transform('count') 
logf = logf[['date', 'institution', 'cik', 'bcount']]
logf.to_csv('SEC/count/c' + fname + '.csv')
print("Counts Done!")



# Part 2: Till when are IP addresses of parents valid
#### 2.1 Figure out what percent of market value was viewed by parents every quarter
import pandas as pd

########## Get list of parent_names
df_parents = pd.read_csv("parent_ip_pseudo.csv", encoding="ISO-8859-9")
df_parents = df_parents.drop_duplicates(subset=["level"])
parent_list = df_parents[["level"]].rename(columns={"level": 'parent_name'})

df_holdings = pd.read_stata("HoldingFromCRSP_CIK.dta") ### we created in stata

df_holdings = df_holdings.rename(columns={'month': 'monthstat'})
df_holdings['month'] = (df_holdings['monthstat'] % 12) + 1
df_holdings['year']  = 1960 + (df_holdings['monthstat'] // 12) ##stata year begins 1960
print(df_holdings)

# Filter for quarter ends (months 3, 6, 9, 12)
df_holdings = df_holdings[df_holdings['month'] % 3 == 0]

# Calculate market value percentage
df_holdings['mv_per'] = (df_holdings['mv_total'] / df_holdings['tna']) * 100

### keep parents that we have IP address for
merged_df = pd.merge(df_holdings, parent_list, on="parent_name", how='inner')
merged_df = merged_df.sort_values(by=["parent_name", 'year', 'month', 'mv_per', 'cik'])

# Keep specific columns and filter year < 2018
qhold = merged_df.loc[merged_df['year'] < 2018, ["parent_name", 'year', 'month', 'mv_per', 'cik']]
qhold[['year', 'month', 'cik']] = qhold[['year', 'month', 'cik']].astype(int)
qhold = qhold.rename(columns={'parent_name': 'level'}) ##quarterly holdings

#### run for each quarter you are inteerested in
num = 30 ##number indicates the quarter we are looking at
import math
year = 2003 + math.floor((num-1)/4)    
month = (num%4)*3  
if month == 0: month = 12

from datetime import datetime, timedelta
from calendar import monthrange
qend = datetime(year, month, monthrange(year, month)[1])

# Figure out how much of their holdings each parent company viewed
qhold = qhold[qhold['year'] == year]
qhold = qhold[qhold['month'] == month] 

qhold['checked'] = 0 # create a column to see if the cik was checked
for i in range(0,91): ## one for each day of the quarter
    
    fdate = qend - timedelta(days=i) #file date
    fyear = fdate.strftime("%Y")
    fmonth = fdate.strftime("%m")
    fday = fdate.strftime("%d")
    fname = fyear + fmonth + fday
    #print(fname)
    
    try:
        daydf = pd.read_csv('../SEC/counts/c' + fname + '.csv')
    except:
        print('missing ' + fname)
        continue
    
    # merge to see which ciks they checked
    daydf = daydf[['level', 'cik']]
    qhold = pd.merge(qhold, daydf, left_on=['level', 'cik'], 
                        right_on=['level', 'cik'], how='left', indicator = True)
    qhold.loc[qhold._merge == "both", 'checked'] = 1
    qhold = qhold.drop(columns=['_merge'])
    print(qhold['checked'].sum())

#Once we are done with the quarter, lets sum the market value percent
qhold = qhold[qhold['checked'] == 1]
qhold['hold_checked'] = qhold.groupby('level')['mv_per'].transform('sum')

qhold = qhold[['year', 'month', 'level', 'hold_checked']]
qhold.drop_duplicates(subset='level', keep='first', inplace=True)
qhold.to_csv('maintable_sur/' + str(num) + '_.csv')

##### once all the quarters are done - combine them
import os
import pandas as pd
allcsv = os.listdir('maintable_sur') 

# get column names
csv = allcsv[1]
counts = pd.read_csv('maintable_sur/' + csv) 
counts = counts.iloc[0:0] ## get column names - year, month, level, hold_checked

for csv in allcsv:
    count = pd.read_csv('maintable_sur/' + csv) #count
    counts = counts.append(count)

####### Part 2.2 Now check if we have two consecutive quarters of low views
###### If a fund family does not access more than 1% of its holding in two consecutive quarters, we stop including the fund family before the quarter. For example, Cambiar Investors accessed 1.9%, 3.3%, 0.0%, and 0.1% of its holdings in 2015Q4, 2015Q3, 2015Q2, and 2015Q1 respectively. Therefore, we exclude Cambiar Investors from our sample before June 2015.
df = counts.copy()

###### create a pivot table showing what percentage of market value each fund family saw
# Create a period column in 'YYYY_M' format
df['period'] = df['year'].astype(str) + '_' + df['month'].astype(str)
pivot_df = df.pivot(index='level', columns='period', values='hold_checked')

pivot_df = pivot_df.fillna(0).round(1)

# We sort by converting the column names back into (year, month) integers for proper ordering
def sort_key(col_name):
    year, month = map(int, col_name.split('_'))
    return (year, month)

sorted_columns = sorted(pivot_df.columns, key=sort_key, reverse=True)
lev_survive = pivot_df[sorted_columns].reset_index() ### level survive

# Now go through each column to see if its above the threshold
cols = [c for c in lev_survive.columns if c != 'level']
def get_date_tuple(c):
    y, m = map(int, c.split('_'))
    return (y, m)
sorted_cols = sorted(cols, key=get_date_tuple, reverse=True)

def calculate_valid_after(row):
    search_cols = [c for c in sorted_cols if get_date_tuple(c) <= (2016, 12)]
    
    found_period = None
    
    # Iterate through the columns to find two consecutive values <= 0.99
    for i in range(len(search_cols) - 1):
        curr_col = search_cols[i]
        next_col = search_cols[i+1]
        
        if row[curr_col] <= 0.99 and row[next_col] <= 0.99:
            found_period = curr_col
            break
            
    if not found_period:
        return "30-Dec-02"
    
    # 3. Convert period (YYYY_M) to Date String
    y_str, m_str = found_period.split('_')
    year = int(y_str)
    month_num = int(m_str)
    
    # Adjust month (9 becomes 6/Jun, 12 becomes 9/Sep, etc.)
    adjusted_month_num = month_num - 3
    if adjusted_month_num <= 0: # Handle edge case if rolling into previous year
        adjusted_month_num += 12
        year -= 1
        
    date_obj = pd.to_datetime(f"{year}-{adjusted_month_num}-30")
    return date_obj.strftime('%d-%b-%y')

# Apply the function
lev_survive['validafter'] = lev_survive.apply(calculate_valid_after, axis=1)
lev_survive = lev_survive[['validafter','level']]



## PART 3 Create How many times parent_name saw filings for each meeting
### 3.1 get the meeting details about when they were filed
import pandas as pd

meeting_details = pd.read_stata("MeetingDetails.dta") ## code in B1_MeetingDetails
meeting_details['cik'] = meeting_details['cik'].astype(int)
filing_date = pd.read_stata("filingdate.dta") ## this is available from SEC Companyidx and has fdate, cik column
df = pd.merge(meeting_details, filing_date, on='cik', how='left')

# remove bad combinations
df['dist'] = 10000
df['meetingdate'] = pd.to_datetime(df['meetingdate'])
df['fdate'] = pd.to_datetime(df['fdate'])
mask = df['meetingdate'] > df['fdate']
df.loc[mask, 'dist'] = (df['meetingdate'] - df['fdate']).dt.days

# keep only the record with the smallest 'dist' for each 'meetunq'
df = df.sort_values(by=['meetunq', 'dist'])
df = df.drop_duplicates(subset=['meetunq'], keep='first')
df.loc[df['fdate'] >= df['meetingdate'],'fdate'] = pd.NaT

# begin date 
df['bdate'] = df['fdate'] - pd.Timedelta(days=30)

# replace missing filing dates
mask_missing = df['fdate'].isna()
df.loc[mask_missing, 'bdate'] = df['meetingdate'] - pd.Timedelta(days=75)
unid = df[['cik', 'meetingdate', 'bdate']]


##### 3.2 Create one empty table for each parent_name (or level) we have ip with the cik and meeting dates
### Start by making empty tables with 4 columns - meetingdate, cik, bdate (beginning date), level
level = pd.read_csv('parent_ip_pseudo.csv') 
levellist = sorted(list(set(level['level']))) 

num = 0
for lev in levellist:
    print(lev)
    levtab = unid.copy()
    levtab['level'] = lev
    levtab.to_csv('maintable_empty/' + str(num) + '_empty.csv')
    num = num + 1


##### 3.3 Fill those empty tables with  IP activity for each parent_name
## Combine all the files downloaded in Part 1 to create dataframe of all IP activity
import os
import pandas as pd
allcsv = os.listdir('SEC/count') #

# get column names
csv = allcsv[1]
combinedactivity = pd.read_csv('SEC/count/' + csv) 
combinedactivity = combinedactivity.iloc[0:0] ## get column names

for csv in allcsv:
    onedaycount = pd.read_csv('SEC/count/' + csv) 
    combinedactivity = counts.append(onedaycount)


####### Run one for each parent_name (or level)
import os
import sys
import pandas as pd

if os.getcwd() == "":
    num = int(sys.argv[1])
else: 
    num = 99 ## here num tells you which parent_name you are running the table for
    
# the table we need to fill - fill table
ftab = pd.read_csv('maintable_empty/' + str(num) + '_empty.csv') 
ftab.dtypes
ftab['bdate'] = ftab['bdate'].astype('datetime64[ns]') 
ftab['meetingdate'] = ftab['meetingdate'].astype('datetime64[ns]')
lev = ftab['level'][0]

# filter table - based on quarter survival
filt_sur = lev_survive.copy() ## the dataframe we created above
filt_sur.dtypes
filt_sur['validafter'] = filt_sur['validafter'].astype('datetime64[ns]')
if lev not in filt_sur['level'].tolist():
    print(lev + ' not in list')
    sys.exit()
    
validafter = filt_sur.loc[filt_sur['level'] == lev, 'validafter'].values[0]
ftab = ftab[ftab['meetingdate'] > validafter] ## focus only on the meetings that happened after the IP adresses survive

###### filter combinedactivity to only the level we care about
countall = combinedactivity[combinedactivity['level'] == lev]
countall.dtypes
countall['date'] = countall['date'].astype('datetime64[ns]')
countall = countall[['cik', 'date', 'bcount']]

def getc(bd, ed, cik): #begining date, end date, cik
    df = countall.copy()
    df = df[df['cik'] == cik]
    df = df[df['date'] >= bd] 
    df = df[df['date'] <= ed] 
    return df['bcount'].sum()   

ftab['bcount'] = ftab.apply(lambda row: getc(row.bdate, row.meetingdate, row.cik), axis=1)
ftab = ftab[['meetingdate', 'cik', 'level', 'bcount']]
ftab.to_csv('maintable/' + str(num) + '.csv') 
print('maintable saved') 

### 3.4 Combine the maintables
import os
import pandas as pd
allcsv = os.listdir('maintable')

# get column names
csv = allcsv[1]
counts = pd.read_csv('maintable/' + csv)
counts = counts.iloc[0:0] ## get columns - meetingdate, cik, level, bcount

for csv in allcsv:
    count = pd.read_csv('maintable/' + csv) #count
    counts = counts.append(count)

counts.to_csv('InstitutionIPLogCounts.csv') 

